{
 "cells": [
  {
   "cell_type": "markdown",
   "metadata": {
    "deletable": true,
    "editable": true
   },
   "source": [
    "# Ch `05`: Concept `02`"
   ]
  },
  {
   "cell_type": "markdown",
   "metadata": {
    "deletable": true,
    "editable": true
   },
   "source": [
    "## Segmentation"
   ]
  },
  {
   "cell_type": "markdown",
   "metadata": {
    "deletable": true,
    "editable": true
   },
   "source": [
    "Import libraries and define hyper-parameters:"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 1,
   "metadata": {
    "collapsed": true,
    "deletable": true,
    "editable": true
   },
   "outputs": [],
   "source": [
    "import tensorflow as tf\n",
    "import numpy as np\n",
    "from bregman.suite import *\n",
    "\n",
    "k = 2\n",
    "segment_size = 50\n",
    "max_iterations = 100"
   ]
  },
  {
   "cell_type": "markdown",
   "metadata": {
    "deletable": true,
    "editable": true
   },
   "source": [
    "Define functions to get the chromogram and the dataset:"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 2,
   "metadata": {
    "collapsed": true,
    "deletable": true,
    "editable": true
   },
   "outputs": [],
   "source": [
    "chromo = tf.placeholder(tf.float32)\n",
    "max_freqs = tf.argmax(chromo, 0)\n",
    "\n",
    "def get_chromogram(audio_file):\n",
    "    F = Chromagram(audio_file, nfft=16384, wfft=8192, nhop=2205)\n",
    "    return F.X\n",
    "\n",
    "def get_dataset(sess, audio_file):\n",
    "    chromo_data = get_chromogram(audio_file)\n",
    "    print('chromo_data', np.shape(chromo_data))\n",
    "    chromo_length = np.shape(chromo_data)[1]\n",
    "    xs = []\n",
    "    for i in range(chromo_length/segment_size):\n",
    "        chromo_segment = chromo_data[:, i*segment_size:(i+1)*segment_size]\n",
    "        x = extract_feature_vector(sess, chromo_segment)\n",
    "        if len(xs) == 0:\n",
    "            xs = x\n",
    "        else:\n",
    "            xs = np.vstack((xs, x))\n",
    "    return xs"
   ]
  },
  {
   "cell_type": "markdown",
   "metadata": {
    "deletable": true,
    "editable": true
   },
   "source": [
    "As required for the k-means algorithm, specify the assignment and re-centering code:"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 3,
   "metadata": {
    "collapsed": true,
    "deletable": true,
    "editable": true
   },
   "outputs": [],
   "source": [
    "def initial_cluster_centroids(X, k):\n",
    "    return X[0:k, :]\n",
    "\n",
    "\n",
    "def assign_cluster(X, centroids):\n",
    "    expanded_vectors = tf.expand_dims(X, 0)\n",
    "    expanded_centroids = tf.expand_dims(centroids, 1)\n",
    "    distances = tf.reduce_sum(tf.square(tf.subtract(expanded_vectors, expanded_centroids)), 2)\n",
    "    mins = tf.argmin(distances, 0)\n",
    "    return mins\n",
    "\n",
    "\n",
    "def recompute_centroids(X, Y):\n",
    "    sums = tf.unsorted_segment_sum(X, Y, k)\n",
    "    counts = tf.unsorted_segment_sum(tf.ones_like(X), Y, k)\n",
    "    return sums / counts\n"
   ]
  },
  {
   "cell_type": "markdown",
   "metadata": {
    "deletable": true,
    "editable": true
   },
   "source": [
    "Given a chromogram, extract a histogram of sound frequencies as our feature vector: "
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 4,
   "metadata": {
    "collapsed": true,
    "deletable": true,
    "editable": true
   },
   "outputs": [],
   "source": [
    "def extract_feature_vector(sess, chromo_data):\n",
    "    num_features, num_samples = np.shape(chromo_data)\n",
    "    freq_vals = sess.run(max_freqs, feed_dict={chromo: chromo_data})\n",
    "    hist, bins = np.histogram(freq_vals, bins=range(num_features + 1))\n",
    "    return hist.astype(float) / num_samples"
   ]
  },
  {
   "cell_type": "markdown",
   "metadata": {
    "deletable": true,
    "editable": true
   },
   "source": [
    "In a session, segment an audio file using k-means:"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 5,
   "metadata": {
    "collapsed": false,
    "deletable": true,
    "editable": true
   },
   "outputs": [
    {
     "name": "stderr",
     "output_type": "stream",
     "text": [
      "/usr/local/lib/python2.7/dist-packages/bregman/features_base.py:353: VisibleDeprecationWarning: using a non-integer number instead of an integer will result in an error in the future\n",
      "  mxnorm = P.empty(self._cqtN) # Normalization coefficients\n",
      "/usr/local/lib/python2.7/dist-packages/bregman/features_base.py:357: VisibleDeprecationWarning: using a non-integer number instead of an integer will result in an error in the future\n",
      "  for i in P.arange(self._cqtN)])\n"
     ]
    },
    {
     "name": "stdout",
     "output_type": "stream",
     "text": [
      "('chromo_data', (12, 633))\n",
      "(12, 12)\n",
      "('iteration', 50)\n",
      "('iteration', 100)\n",
      "('0.0m 0.0s', 0)\n",
      "('0.0m 2.5s', 1)\n",
      "('0.0m 5.0s', 0)\n",
      "('0.0m 7.5s', 1)\n",
      "('0.0m 10.0s', 1)\n",
      "('0.0m 12.5s', 1)\n",
      "('0.0m 15.0s', 1)\n",
      "('0.0m 17.5s', 0)\n",
      "('0.0m 20.0s', 1)\n",
      "('0.0m 22.5s', 1)\n",
      "('0.0m 25.0s', 0)\n",
      "('0.0m 27.5s', 0)\n"
     ]
    }
   ],
   "source": [
    "with tf.Session() as sess:\n",
    "    X = get_dataset(sess, 'TalkingMachinesPodcast.wav')\n",
    "    print(np.shape(X))\n",
    "    centroids = initial_cluster_centroids(X, k)\n",
    "    i, converged = 0, False\n",
    "    # prev_Y = None\n",
    "    while not converged and i < max_iterations:\n",
    "        i += 1\n",
    "        Y = assign_cluster(X, centroids)\n",
    "        # if prev_Y == Y:\n",
    "        #     converged = True\n",
    "        #     break\n",
    "        # prev_Y = Y\n",
    "        centroids = sess.run(recompute_centroids(X, Y))\n",
    "        if i % 50 == 0:\n",
    "            print('iteration', i)\n",
    "    segments = sess.run(Y)\n",
    "    for i in range(len(segments)):\n",
    "        seconds = (i * segment_size) / float(20)\n",
    "        min, sec = divmod(seconds, 60)\n",
    "        time_str = str(min) + 'm ' + str(sec) + 's'\n",
    "        print(time_str, segments[i])"
   ]
  }
 ],
 "metadata": {
  "kernelspec": {
   "display_name": "Python 2",
   "language": "python",
   "name": "python2"
  },
  "language_info": {
   "codemirror_mode": {
    "name": "ipython",
    "version": 2
   },
   "file_extension": ".py",
   "mimetype": "text/x-python",
   "name": "python",
   "nbconvert_exporter": "python",
   "pygments_lexer": "ipython2",
   "version": "2.7.12"
  }
 },
 "nbformat": 4,
 "nbformat_minor": 1
}
